Set up our necessary packages. Uncomment the install.packages line the first time you run through this. Set your home directory to be the src root of this project. You’ll need to change this before you get started.
Load in all of our data. The modulo of the sum of our UTD IDs was 2, so we will be using Tronix, Omisego, and YoCoin for our analysis.
# Price history: one table per token. The first line of each file is skipped
# and the seven column names below are applied instead.
# NOTE(review): the Tronix and YoCoin paths carry no ".txt" extension, unlike
# the OmiseGO path -- confirm they match the files on disk.
omg_price_df <- read.table(
  "./tokenPrices/omisego.txt",
  header = FALSE,
  skip = 1,
  col.names = c("Date", "Open", "High", "Low", "Close", "Volume", "MarketCap")
)
trn_price_df <- read.table(
  "./tokenPrices/tron",
  header = FALSE,
  skip = 1,
  col.names = c("Date", "Open", "High", "Low", "Close", "Volume", "MarketCap")
)
yoc_price_df <- read.table(
  "./tokenPrices/yocoin",
  header = FALSE,
  skip = 1,
  col.names = c("Date", "Open", "High", "Low", "Close", "Volume", "MarketCap")
)
# Edge (transaction) files: space-delimited with no header row.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
omg_edge_df <- read_delim("./edgeFiles/omisego.txt", delim = " ",
                          col_names = FALSE)
trn_edge_df <- read_delim("./edgeFiles/tron.txt", delim = " ",
                          col_names = FALSE)
yoc_edge_df <- read_delim("./edgeFiles/yo.txt", delim = " ",
                          col_names = FALSE)
# Label the four columns of every edge table.
names(omg_edge_df) <- c("fromID", "toID", "unixTime", "tokenAmount")
names(trn_edge_df) <- c("fromID", "toID", "unixTime", "tokenAmount")
names(yoc_edge_df) <- c("fromID", "toID", "unixTime", "tokenAmount")
Check for duplicated values in all of our files and remove them.
# OmiseGO: report duplicate rows in both tables, drop them, then re-check.
# anyDuplicated() returns the index of the first duplicated row (0 if none),
# not a count of duplicates.
cat("omg_price_df duplicates: ", anyDuplicated(omg_price_df), " \n")
## omg_price_df duplicates: 0
# Fixed: this check used to inspect omg_price_df while labelling the result as
# omg_edge_df. The recorded output on the next line came from that mislabelled
# check and should be regenerated.
cat("omg_edge_df duplicates: ", anyDuplicated(omg_edge_df), " \n")
## omg_edge_df duplicates: 0
omg_price_df <- omg_price_df %>% distinct()
omg_edge_df <- omg_edge_df %>% distinct()
cat("omg_edge_df duplicates: ", anyDuplicated(omg_edge_df), " \n") # after duplicates removed
## omg_edge_df duplicates: 0
cat("omg_price_df duplicates: ", anyDuplicated(omg_price_df), " \n") # after duplicates removed
## omg_price_df duplicates: 0
# Tronix: print the index of the first duplicated row in each table (0 means
# none), de-duplicate both tables, then print the check again.
cat("trn_price_df duplicates: ", anyDuplicated(trn_price_df), " \n", sep = " ")
## trn_price_df duplicates: 0
cat("trn_edge_df duplicates: ", anyDuplicated(trn_edge_df), " \n", sep = " ")
## trn_edge_df duplicates: 1536
trn_price_df <- distinct(trn_price_df)
trn_edge_df <- distinct(trn_edge_df)
# Re-run both checks on the de-duplicated tables.
cat("trn_price_df duplicates: ", anyDuplicated(trn_price_df), " \n", sep = " ")
## trn_price_df duplicates: 0
cat("trn_edge_df duplicates: ", anyDuplicated(trn_edge_df), " \n", sep = " ")
## trn_edge_df duplicates: 0
# YoCoin: print the index of the first duplicated row in each table (0 means
# none), de-duplicate both tables, then print the check again.
cat("yoc_price_df duplicates: ", anyDuplicated(yoc_price_df), " \n", sep = " ")
## yoc_price_df duplicates: 0
cat("yoc_edge_df duplicates: ", anyDuplicated(yoc_edge_df), " \n", sep = " ")
## yoc_edge_df duplicates: 992
yoc_price_df <- distinct(yoc_price_df)
yoc_edge_df <- distinct(yoc_edge_df)
# Re-run both checks on the de-duplicated tables.
cat("yoc_price_df duplicates: ", anyDuplicated(yoc_price_df), " \n", sep = " ")
## yoc_price_df duplicates: 0
cat("yoc_edge_df duplicates: ", anyDuplicated(yoc_edge_df), " \n", sep = " ")
## yoc_edge_df duplicates: 0
Convert the date to the correct format in the price data frames.
# Convert the Date column of each price table into Date objects.
# NOTE(review): OmiseGO is parsed with a four-digit year (%Y) while Tronix and
# YoCoin use a two-digit year (%y) -- confirm this matches each file.
omg_price_df[["Date"]] <- as.Date(omg_price_df[["Date"]], format = "%m/%d/%Y")
trn_price_df[["Date"]] <- as.Date(trn_price_df[["Date"]], format = "%m/%d/%y")
yoc_price_df[["Date"]] <- as.Date(yoc_price_df[["Date"]], format = "%m/%d/%y")
Set our constants for each coin, then remove edge file rows where the token amount is too big to make sense. Note: the row counts printed below show records being removed for all three tokens.
# Per-token constants. Their product (decimals * supply) is used below as the
# upper bound on a plausible tokenAmount.
# NOTE(review): presumably the sub-unit scale and total supply of each token --
# confirm against the token specifications.
omg_decimals <- 10^18
omg_supply <- 140245398

trn_decimals <- 10^6
trn_supply <- 66682072191

yoc_decimals <- 10^16
yoc_supply <- 369659255
# Drop OmiseGO transfers whose amount exceeds omg_decimals * omg_supply and
# report how many rows that removes.
# Fixed: the report used a strict `<` while the filter actually applied to
# omg_edge_df used `<=`. The filter is now computed once, with the `<=` that
# the kept data frame already used, and reused for both. The recorded output
# lines below were produced by the old `<` report.
omg_edge_df_filtered <- omg_edge_df %>%
  filter(tokenAmount <= omg_decimals * omg_supply)
cat("Num Rows before Filtering: ", nrow(omg_edge_df), "\n")
## Num Rows before Filtering: 1143029
cat("Num Rows after Filtering: ", nrow(omg_edge_df_filtered), "\n")
## Num Rows after Filtering: 1143018
cat("Num Rows cut: ", (nrow(omg_edge_df) - nrow(omg_edge_df_filtered)), "\n")
## Num Rows cut: 11
omg_edge_df <- omg_edge_df_filtered
# Drop Tronix transfers whose amount exceeds trn_decimals * trn_supply and
# report how many rows that removes.
# Fixed: the report used a strict `<` while the filter actually applied to
# trn_edge_df used `<=`. The filter is now computed once, with the `<=` that
# the kept data frame already used, and reused for both. The recorded output
# lines below were produced by the old `<` report.
trn_edge_df_filtered <- trn_edge_df %>%
  filter(tokenAmount <= trn_decimals * trn_supply)
cat("Num Rows before Filtering: ", nrow(trn_edge_df), "\n")
## Num Rows before Filtering: 1512662
cat("Num Rows after Filtering: ", nrow(trn_edge_df_filtered), "\n")
## Num Rows after Filtering: 1512580
cat("Num Rows cut: ", (nrow(trn_edge_df) - nrow(trn_edge_df_filtered)), "\n")
## Num Rows cut: 82
trn_edge_df <- trn_edge_df_filtered
# Drop YoCoin transfers whose amount exceeds yoc_decimals * yoc_supply and
# report how many rows that removes.
# Fixed: the report used a strict `<` while the filter actually applied to
# yoc_edge_df used `<=`. The filter is now computed once, with the `<=` that
# the kept data frame already used, and reused for both. The column is also
# referenced directly rather than through `yoc_edge_df$` inside filter(). The
# recorded output lines below were produced by the old `<` report.
yoc_edge_df_filtered <- yoc_edge_df %>%
  filter(tokenAmount <= yoc_decimals * yoc_supply)
cat("Num Rows before Filtering: ", nrow(yoc_edge_df), "\n")
## Num Rows before Filtering: 595582
cat("Num Rows after Filtering: ", nrow(yoc_edge_df_filtered), "\n")
## Num Rows after Filtering: 595492
cat("Num Rows cut: ", (nrow(yoc_edge_df) - nrow(yoc_edge_df_filtered)), "\n")
## Num Rows cut: 90
yoc_edge_df <- yoc_edge_df_filtered
Update the edge data frame dates to be the correct format.
# Derive a Date column for every edge table from its unixTime column.
omg_edge_df[["Date"]] <- anydate(omg_edge_df[["unixTime"]])
trn_edge_df[["Date"]] <- anydate(trn_edge_df[["unixTime"]])
yoc_edge_df[["Date"]] <- anydate(yoc_edge_df[["unixTime"]])
# The price Date columns were already converted to Date objects earlier in the
# script, so the repeated as.Date(..., format = ) calls that used to sit here
# returned their input unchanged and have been removed.

# Opening price over time, one scatter plot per token. Each `plot + ...`
# expression is left unassigned so that it is drawn when evaluated.
yocoin_plot <- ggplot(aes(x = Date, y = Open), data = yoc_price_df) +
  geom_point(color = "darkblue")
yocoin_plot + ggtitle("YOC Historical Data") +
  xlab("Date") + ylab("Opening Price")
tron_plot <- ggplot(aes(x = Date, y = Open), data = trn_price_df) +
  geom_point(color = "darkred")
tron_plot + ggtitle("TRX Historical Data") +
  xlab("Date") + ylab("Opening Price")
omg_plot <- ggplot(aes(x = Date, y = Open), data = omg_price_df) +
  geom_point(color = "darkgreen")
omg_plot + ggtitle("OMG Historical Data") +
  xlab("Date") + ylab("Opening Price")
For the sake of basic analysis, we create a value in the dataframe for pairs of buyers and sellers.
# Label every YoCoin transfer with its "from - to" pair and the reversed
# "to - from" pair, then tabulate how often each label occurs.
yoc_edge_df$pairFrom <- paste(yoc_edge_df$fromID, yoc_edge_df$toID, sep = " - ")
yoc_edge_df$pairTo <- paste(yoc_edge_df$toID, yoc_edge_df$fromID, sep = " - ")
# NOTE(review): head() keeps the first 100 rows in table() order, which is by
# pair label rather than by count -- confirm a top-100 was not intended.
yocoin_pairFrom_counts <- head(
  setNames(as.data.frame(table(yoc_edge_df$pairFrom)), c("Pair", "Transactions")),
  n = 100
)
yocoin_pairTo_counts <- head(
  setNames(as.data.frame(table(yoc_edge_df$pairTo)), c("Pair", "Transactions")),
  n = 100
)
# Label every Tronix transfer with its "from - to" pair and the reversed
# "to - from" pair, then tabulate how often each label occurs.
trn_edge_df$pairFrom <- paste(trn_edge_df$fromID, trn_edge_df$toID, sep = " - ")
trn_edge_df$pairTo <- paste(trn_edge_df$toID, trn_edge_df$fromID, sep = " - ")
# NOTE(review): head() keeps the first 100 rows in table() order, which is by
# pair label rather than by count -- confirm a top-100 was not intended.
tron_pairFrom_counts <- head(
  setNames(as.data.frame(table(trn_edge_df$pairFrom)), c("Pair", "Transactions")),
  n = 100
)
tron_pairTo_counts <- head(
  setNames(as.data.frame(table(trn_edge_df$pairTo)), c("Pair", "Transactions")),
  n = 100
)
# Label every OmiseGO transfer with its "from - to" pair and the reversed
# "to - from" pair, then tabulate how often each label occurs.
omg_edge_df$pairFrom <- paste(omg_edge_df$fromID, omg_edge_df$toID, sep = " - ")
omg_edge_df$pairTo <- paste(omg_edge_df$toID, omg_edge_df$fromID, sep = " - ")
# NOTE(review): head() keeps the first 100 rows in table() order, which is by
# pair label rather than by count -- confirm a top-100 was not intended.
omg_pairFrom_counts <- head(
  setNames(as.data.frame(table(omg_edge_df$pairFrom)), c("Pair", "Transactions")),
  n = 100
)
omg_pairTo_counts <- head(
  setNames(as.data.frame(table(omg_edge_df$pairTo)), c("Pair", "Transactions")),
  n = 100
)
In this section we plot the number of transactions against the pair of buyer and seller. This just helps us understand the distribution of transactions between user pairs. There are two graphs for each coin, one for the buyer data and one for the seller data.
# OmiseGO: bar charts of the transaction count per pair label, once for the
# "from - to" labels and once for the "to - from" labels.
# Fixed: the y axis shows the Transactions count, so it is now labelled
# "Number of Transactions" (as in the YOC plots) instead of "Token Amount".
omg_plot_transFrom <-
  ggplot(aes(x = Pair, y = Transactions),
         data = omg_pairFrom_counts) +
  geom_bar(stat = "identity", color = "green") +
  geom_text(aes(label = Transactions))
omg_plot_transFrom +
  ggtitle("OMG Transaction Data") +
  xlab("Buyer and Seller Pair") +
  ylab("Number of Transactions")
omg_plot_transTo <-
  ggplot(aes(x = Pair, y = Transactions),
         data = omg_pairTo_counts) +
  geom_bar(stat = "identity", color = "green") +
  geom_text(aes(label = Transactions))
omg_plot_transTo +
  ggtitle("OMG Transaction Data") +
  xlab("Buyer and Seller Pair") +
  ylab("Number of Transactions")
# Tronix: bar charts of the transaction count per pair label, once for the
# "from - to" labels and once for the "to - from" labels.
# Fixed: the y axis shows the Transactions count, so it is now labelled
# "Number of Transactions" (as in the YOC plots) instead of "Token Amount".
tron_plot_transFrom <-
  ggplot(aes(x = Pair, y = Transactions),
         data = tron_pairFrom_counts) +
  geom_bar(stat = "identity", color = "darkred") +
  geom_text(aes(label = Transactions))
tron_plot_transFrom +
  ggtitle("TRX Transaction Data") +
  xlab("Buyer and Seller Pair") +
  ylab("Number of Transactions")
tron_plot_transTo <-
  ggplot(aes(x = Pair, y = Transactions),
         data = tron_pairTo_counts) +
  geom_bar(stat = "identity", color = "darkred") +
  geom_text(aes(label = Transactions))
tron_plot_transTo +
  ggtitle("TRX Transaction Data") +
  xlab("Buyer and Seller Pair") +
  ylab("Number of Transactions")
# YoCoin: transaction count per pair label, first for the "from - to" labels
# and then for the "to - from" labels. Each bar carries its count as text.
yocoin_plot_transFrom <-
  ggplot(yocoin_pairFrom_counts, aes(x = Pair, y = Transactions)) +
  geom_bar(color = "darkblue", stat = "identity") +
  geom_text(aes(label = Transactions))
yocoin_plot_transFrom +
  labs(
    title = "YOC Transaction Data",
    x = "Buyer and Seller Pair",
    y = "Number of Transactions"
  )
yocoin_plot_transTo <-
  ggplot(yocoin_pairTo_counts, aes(x = Pair, y = Transactions)) +
  geom_bar(color = "darkblue", stat = "identity") +
  geom_text(aes(label = Transactions))
yocoin_plot_transTo +
  labs(
    title = "YOC Transaction Data",
    x = "Buyer and Seller Pair",
    y = "Number of Transactions"
  )
In this section we take the same data from above and narrow the scope to the top 20 buyers and sellers by number of transactions. From this we begin calculating the distributions of the transaction data.
# Transactions per user for each token: "buys" count rows per recipient
# (toID), "sells" count rows per sender (fromID). The count column is `n`.
yocoin_buys.distribution <- count(yoc_edge_df, toID)
yocoin_sells.distribution <- count(yoc_edge_df, fromID)
tron_buys.distribution <- count(trn_edge_df, toID)
tron_sells.distribution <- count(trn_edge_df, fromID)
omg_buys.distribution <- count(omg_edge_df, toID)
omg_sells.distribution <- count(omg_edge_df, fromID)
# Heading, then the 20 YoCoin recipients with the most transactions.
print("Buys Top 20")
## [1] "Buys Top 20"
print(head(arrange(yocoin_buys.distribution, desc(n)), 20))
## # A tibble: 20 x 2
## toID n
## <dbl> <int>
## 1 9911653 14601
## 2 309659 6592
## 3 9912976 4423
## 4 9916042 4044
## 5 9915788 3990
## 6 9913800 3351
## 7 9911955 2518
## 8 9912979 2491
## 9 9911654 2153
## 10 9911658 2104
## 11 9916222 2076
## 12 9915420 2043
## 13 9912036 1910
## 14 9914909 1818
## 15 9915919 1736
## 16 9916338 1690
## 17 9915232 1668
## 18 9915389 1652
## 19 9913169 1651
## 20 9917297 1494
# The 20 Tronix recipients with the most transactions.
print(head(arrange(tron_buys.distribution, desc(n)), 20))
## # A tibble: 20 x 2
## toID n
## <dbl> <int>
## 1 5 86073
## 2 1752093 11234
## 3 1742290 8168
## 4 182337 7391
## 5 9353350 6093
## 6 75994 5967
## 7 40112 4660
## 8 26 4016
## 9 49 3292
## 10 40002 2804
## 11 75995 2520
## 12 1820 2518
## 13 1742287 2448
## 14 104502 1976
## 15 297031 1762
## 16 9245671 1747
## 17 40044 1718
## 18 104531 1690
## 19 60 1649
## 20 118 1539
# The 20 OmiseGO recipients with the most transactions.
print(head(arrange(omg_buys.distribution, desc(n)), 20))
## # A tibble: 20 x 2
## toID n
## <dbl> <int>
## 1 297278 64501
## 2 5 53758
## 3 311608 32541
## 4 36161 22877
## 5 75994 11124
## 6 297094 7893
## 7 296381 5523
## 8 1742290 5180
## 9 182337 4508
## 10 1739369 3729
## 11 297031 3619
## 12 142341 3394
## 13 49 2692
## 14 1741637 2295
## 15 303329 2111
## 16 75989 2016
## 17 104531 1943
## 18 298450 1896
## 19 297301 1866
## 20 48315 1496
# Heading, then the 20 YoCoin senders with the most transactions.
print("Sells Top 20")
## [1] "Sells Top 20"
print(head(arrange(yocoin_sells.distribution, desc(n)), 20))
## # A tibble: 20 x 2
## fromID n
## <dbl> <int>
## 1 9911594 384348
## 2 9912282 36887
## 3 9916066 28026
## 4 9921392 23708
## 5 9915044 23346
## 6 9916176 21047
## 7 9915539 9228
## 8 9915042 9127
## 9 9916067 8404
## 10 309659 4093
## 11 9913938 2647
## 12 9913936 2418
## 13 9911653 1692
## 14 9913503 1349
## 15 9926337 822
## 16 9921021 648
## 17 9915105 602
## 18 9913429 591
## 19 9915975 570
## 20 9914909 567
# The 20 Tronix senders with the most transactions.
print(head(arrange(tron_sells.distribution, desc(n)), 20))
## # A tibble: 20 x 2
## fromID n
## <dbl> <int>
## 1 5 156005
## 2 1742290 86914
## 3 82 56066
## 4 44 49618
## 5 49 45023
## 6 6 42470
## 7 17 23051
## 8 13 20717
## 9 9472639 20235
## 10 9472643 20228
## 11 9472641 20226
## 12 9472657 20220
## 13 9472667 20220
## 14 9472659 20190
## 15 9472637 20178
## 16 9472649 20168
## 17 9472665 20120
## 18 9472663 20040
## 19 9472633 19975
## 20 9472647 19948
# The 20 OmiseGO senders with the most transactions.
print(head(arrange(omg_sells.distribution, desc(n)), 20))
## # A tibble: 20 x 2
## fromID n
## <dbl> <int>
## 1 17 203746
## 2 297278 52256
## 3 5 48461
## 4 311608 37141
## 5 36161 32198
## 6 13 21813
## 7 75994 19347
## 8 307831 18201
## 9 296792 18097
## 10 304118 17288
## 11 307797 16222
## 12 44 13037
## 13 296381 12880
## 14 82 12605
## 15 6 10805
## 16 297094 10665
## 17 256505 10631
## 18 297031 8572
## 19 75989 8068
## 20 49 6263
This section takes the seller and buyer distribution data, orders it by decreasing number of transactions, and keeps the top 20. Once these calculations are completed, we plot the data frames as bar charts and show the numeric value at the top of each bar. This view is far more useful for showing the transaction data.
# YoCoin: the 20 most active senders, then the 20 most active recipients, each
# drawn as a bar chart positioned by rank with the count printed above the bar.
yocoin_sell_df <- head(arrange(yocoin_sells.distribution, desc(n)), 20)
yocoin_sell_df$row_id <- as.numeric(row.names(yocoin_sell_df))
yocoin_sells_quant_bar <- ggplot(yocoin_sell_df, aes(x = row_id, y = n)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  geom_text(aes(label = n), size = 3.5, vjust = -0.3) +
  theme_minimal()
print(yocoin_sells_quant_bar)

yocoin_buy_df <- head(arrange(yocoin_buys.distribution, desc(n)), 20)
yocoin_buy_df$row_id <- as.numeric(row.names(yocoin_buy_df))
yocoin_buys_quant_bar <- ggplot(yocoin_buy_df, aes(x = row_id, y = n)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  geom_text(aes(label = n), size = 3.5, vjust = -0.3) +
  theme_minimal()
print(yocoin_buys_quant_bar)
# Tronix: the 20 most active senders, then the 20 most active recipients, each
# drawn as a bar chart positioned by rank with the count printed above the bar.
# Fixed: both data frames were previously built from the YoCoin distributions
# (yocoin_sells.distribution / yocoin_buys.distribution), so the "tron" charts
# showed YoCoin data.
tron_sell_df <- tron_sells.distribution %>% arrange(-n) %>% head(20)
tron_sell_df$row_id <- as.numeric(row.names(tron_sell_df))
tron_sells_quant_bar <- ggplot(data = tron_sell_df, aes(x = row_id, y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = n), vjust = -0.3, size = 3.5) +
  theme_minimal()
print(tron_sells_quant_bar)
tron_buy_df <- tron_buys.distribution %>% arrange(-n) %>% head(20)
tron_buy_df$row_id <- as.numeric(row.names(tron_buy_df))
tron_buys_quant_bar <- ggplot(data = tron_buy_df, aes(x = row_id, y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = n), vjust = -0.3, size = 3.5) +
  theme_minimal()
print(tron_buys_quant_bar)
# OmiseGO: the 20 most active senders, then the 20 most active recipients, each
# drawn as a bar chart positioned by rank with the count printed above the bar.
omg_sell_df <- head(arrange(omg_sells.distribution, desc(n)), 20)
omg_sell_df$row_id <- as.numeric(row.names(omg_sell_df))
omg_sells_quant_bar <- ggplot(omg_sell_df, aes(x = row_id, y = n)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  geom_text(aes(label = n), size = 3.5, vjust = -0.3) +
  theme_minimal()
print(omg_sells_quant_bar)

omg_buy_df <- head(arrange(omg_buys.distribution, desc(n)), 20)
omg_buy_df$row_id <- as.numeric(row.names(omg_buy_df))
omg_buys_quant_bar <- ggplot(omg_buy_df, aes(x = row_id, y = n)) +
  geom_bar(fill = "steelblue", stat = "identity") +
  geom_text(aes(label = n), size = 3.5, vjust = -0.3) +
  theme_minimal()
print(omg_buys_quant_bar)
This section is dedicated to getting proper buying distributions (ordered and truncated to the top 20). We calculate the total volume of the transactions. After that we filter the data, apply some scaling, and normalize it.
# YoCoin transactions per "from - to" pair, most active pair first; the top 20
# are printed.
yocoin_by_pair_df <- yoc_edge_df %>%
  count(pairFrom) %>%
  arrange(desc(n))
print(head(yocoin_by_pair_df, 20))
## # A tibble: 20 x 2
## pairFrom n
## <chr> <int>
## 1 9911594 - 9912976 4419
## 2 9911594 - 9916042 4035
## 3 9911594 - 9915788 3977
## 4 9911594 - 9913800 3306
## 5 9913938 - 9911653 2647
## 6 9913936 - 9911653 2418
## 7 9911594 - 9912979 2363
## 8 9911594 - 9911955 2315
## 9 9911594 - 9916222 1819
## 10 9911594 - 9911654 1797
## 11 9911594 - 9911658 1690
## 12 9911594 - 9913169 1630
## 13 9911594 - 9912036 1600
## 14 9911594 - 9915389 1594
## 15 9911594 - 9915919 1594
## 16 9911594 - 9915232 1550
## 17 9911594 - 9915420 1493
## 18 9911594 - 9913658 1485
## 19 9911594 - 9913895 1413
## 20 9911594 - 9916338 1382
# Tronix transactions per "from - to" pair, most active pair first.
tron_by_pair_df <- trn_edge_df %>%
  group_by(pairFrom) %>%
  summarise(n = n()) %>%
  arrange(-n) %>%
  ungroup()
# Fixed: this used to print yocoin_by_pair_df. The recorded output below is
# still the YoCoin table and must be regenerated.
print(tron_by_pair_df %>% head(20))
## # A tibble: 20 x 2
## pairFrom n
## <chr> <int>
## 1 9911594 - 9912976 4419
## 2 9911594 - 9916042 4035
## 3 9911594 - 9915788 3977
## 4 9911594 - 9913800 3306
## 5 9913938 - 9911653 2647
## 6 9913936 - 9911653 2418
## 7 9911594 - 9912979 2363
## 8 9911594 - 9911955 2315
## 9 9911594 - 9916222 1819
## 10 9911594 - 9911654 1797
## 11 9911594 - 9911658 1690
## 12 9911594 - 9913169 1630
## 13 9911594 - 9912036 1600
## 14 9911594 - 9915389 1594
## 15 9911594 - 9915919 1594
## 16 9911594 - 9915232 1550
## 17 9911594 - 9915420 1493
## 18 9911594 - 9913658 1485
## 19 9911594 - 9913895 1413
## 20 9911594 - 9916338 1382
# OmiseGO transactions per "from - to" pair, most active pair first.
omg_by_pair_df <- omg_edge_df %>%
  group_by(pairFrom) %>%
  summarise(n = n()) %>%
  arrange(-n) %>%
  ungroup()
# Fixed: this used to print yocoin_by_pair_df. The recorded output below is
# still the YoCoin table and must be regenerated.
print(omg_by_pair_df %>% head(20))
## # A tibble: 20 x 2
## pairFrom n
## <chr> <int>
## 1 9911594 - 9912976 4419
## 2 9911594 - 9916042 4035
## 3 9911594 - 9915788 3977
## 4 9911594 - 9913800 3306
## 5 9913938 - 9911653 2647
## 6 9913936 - 9911653 2418
## 7 9911594 - 9912979 2363
## 8 9911594 - 9911955 2315
## 9 9911594 - 9916222 1819
## 10 9911594 - 9911654 1797
## 11 9911594 - 9911658 1690
## 12 9911594 - 9913169 1630
## 13 9911594 - 9912036 1600
## 14 9911594 - 9915389 1594
## 15 9911594 - 9915919 1594
## 16 9911594 - 9915232 1550
## 17 9911594 - 9915420 1493
## 18 9911594 - 9913658 1485
## 19 9911594 - 9913895 1413
## 20 9911594 - 9916338 1382
# Total number of transfers per token: the per-pair counts summed back up.
yocoin_total_trade_volume <- sum(yocoin_by_pair_df[["n"]])
tron_total_trade_volume <- sum(tron_by_pair_df[["n"]])
omg_total_trade_volume <- sum(omg_by_pair_df[["n"]])
# Optionally Drop out the outlier pair(311608 - 311608), n(30024)
# Comment this line out if you want to leave it in
#yocoin_by_pair_df = yocoin_by_pair_df %>% filter(n < 30000)
#cat('FILTERING!\n')
# Filtering and scaling the data in different ways.
# For each token: keep the first 100 rows of the by-pair table (the most active
# pairs, since that table is sorted by descending count), number them, and add
# a min-max scaled count. The +0.001 / +0.002 offsets keep n_scaled strictly
# between 0 and 1.
yocoin_pair_df <- yocoin_by_pair_df %>% head(100)
yocoin_pair_df$row_id <- as.numeric(row.names(yocoin_pair_df))
yocoin_pair_df$n_scaled <- (yocoin_pair_df$n - min(yocoin_pair_df$n) + 0.001) /
  (max(yocoin_pair_df$n) - min(yocoin_pair_df$n) + 0.002)
tron_pair_df <- tron_by_pair_df %>% head(100)
tron_pair_df$row_id <- as.numeric(row.names(tron_pair_df))
tron_pair_df$n_scaled <- (tron_pair_df$n - min(tron_pair_df$n) + 0.001) /
  (max(tron_pair_df$n) - min(tron_pair_df$n) + 0.002)
# Fixed: this previously took its rows from yocoin_by_pair_df, so everything
# derived from omg_pair_df reused the YoCoin counts.
omg_pair_df <- omg_by_pair_df %>% head(100)
omg_pair_df$row_id <- as.numeric(row.names(omg_pair_df))
omg_pair_df$n_scaled <- (omg_pair_df$n - min(omg_pair_df$n) + 0.001) /
  (max(omg_pair_df$n) - min(omg_pair_df$n) + 0.002)
# Reverse it so it goes up and right
# pairdf$n_scaled <- rev(pairdf$n_scaled)
# normalize by total_trade_volume
yocoin_pair_df$n_norm <- yocoin_pair_df$n / yocoin_total_trade_volume
tron_pair_df$n_norm <- tron_pair_df$n / tron_total_trade_volume
omg_pair_df$n_norm <- omg_pair_df$n / omg_total_trade_volume
From here we take the data, and apply some different distributions. - Log Normal - Exponential - Geometric - Weibull - Gamma - Negative Binomial
# Display each pair table as a plain data frame.
as.data.frame(yocoin_pair_df)
as.data.frame(tron_pair_df)
as.data.frame(omg_pair_df)
# Reduce each table to the three columns used for fitting and plotting.
keeps <- c("n_scaled", "n", "row_id")
yocoin_clean_data <- as.data.frame(yocoin_pair_df)[, keeps]
tron_clean_data <- as.data.frame(tron_pair_df)[, keeps]
omg_clean_data <- as.data.frame(omg_pair_df)[, keeps]
# Fit candidate distributions to the per-pair transaction counts of each token.
# The geometric fits, and the Weibull fits for YoCoin and OmiseGO, are left
# commented out.

# YoCoin
fit.lnorm.yocoin_pair_df <- fitdistr(yocoin_clean_data$n, "lognormal")
fit.exp.yocoin_pair_df <- fitdistr(yocoin_clean_data$n, "exponential")
# fit.geom.yocoin_pair_df = fitdistr(yocoin_clean_data$n, densfun='geometric')
# fit.weibull.yocoin_pair_df = fitdistr(yocoin_clean_data$n, densfun='weibull')
fit.gamma.yocoin_pair_df <- fitdistr(yocoin_clean_data$n, "gamma")
fit.nbinomial.yocoin_pair_df <- fitdistr(yocoin_clean_data$n, "negative binomial")

# Tronix
fit.lnorm.tron_pair_df <- fitdistr(tron_clean_data$n, "lognormal")
fit.exp.tron_pair_df <- fitdistr(tron_clean_data$n, "exponential")
# fit.geom.tron_pair_df = fitdistr(tron_clean_data$n, densfun='geometric')
fit.weibull.tron_pair_df <- fitdistr(tron_clean_data$n, "weibull")
fit.gamma.tron_pair_df <- fitdistr(tron_clean_data$n, "gamma")
fit.nbinomial.tron_pair_df <- fitdistr(tron_clean_data$n, "negative binomial")

# OmiseGO
fit.lnorm.omg_pair_df <- fitdistr(omg_clean_data$n, "lognormal")
fit.exp.omg_pair_df <- fitdistr(omg_clean_data$n, "exponential")
# fit.geom.omg_pair_df = fitdistr(omg_clean_data$n, densfun='geometric')
# fit.weibull.omg_pair_df = fitdistr(omg_clean_data$n, densfun='weibull')
fit.gamma.omg_pair_df <- fitdistr(omg_clean_data$n, "gamma")
fit.nbinomial.omg_pair_df <- fitdistr(omg_clean_data$n, "negative binomial")
#print(fit.lnorm.pairdf)
# fit.weibull.pairdf = fitdistr(clean_data$n, densfun='weibull', start=list(shape=1, scale=500))
# print(fit.weibull.pairdf$estimate[1])
#print(fit.lnorm.pairdf$estimate[1])
#print(fit.exp.pairdf$estimate[1])
#print(fit.geom.pairdf$estimate[1])
This section is just to set up and show the plot for each of the coins and their processed data.
# Density of the YoCoin pair counts with the fitted curves overlaid:
# red = log-normal, green = exponential, purple = gamma,
# pink = negative binomial.
yocoin_pair_bar <- ggplot(yocoin_clean_data) +
  geom_histogram(aes(x = n), stat = "density", fill = "steelblue") +
  stat_function(
    fun = "dlnorm",
    n = 100,
    args = list(
      meanlog = fit.lnorm.yocoin_pair_df$estimate[1],
      sdlog = fit.lnorm.yocoin_pair_df$estimate[2]
    ),
    color = "red",
    size = 1
  ) +
  stat_function(
    fun = "dexp",
    args = list(rate = fit.exp.yocoin_pair_df$estimate[1]),
    color = "green",
    size = 1
  ) +
  # stat_function(fun = "dweibull",
  #               size = 1,
  #               args = list(shape = fit.weibull.yocoin_pair_df$estimate[1],
  #                           scale=fit.weibull.yocoin_pair_df$estimate[2]),
  #               color = "orange") +
  stat_function(
    fun = "dgamma",
    args = list(
      shape = fit.gamma.yocoin_pair_df$estimate[1],
      rate = fit.gamma.yocoin_pair_df$estimate[2]
    ),
    color = "purple",
    size = 1
  ) +
  # The ones below here are somewhat useless for the full dataset
  # stat_function(fun = "dgeom",
  #               size = 2,
  #               args = list(prob = fit.geom.yocoin_pair_df$estimate[1]),
  #               color = "blue") +
  stat_function(
    fun = "dnbinom",
    args = list(
      size = fit.nbinomial.yocoin_pair_df$estimate[1],
      mu = fit.nbinomial.yocoin_pair_df$estimate[2]
    ),
    color = "pink",
    size = 1
  ) +
  theme_minimal()
# Density of the Tronix pair counts with the fitted curves overlaid:
# red = log-normal, green = exponential, orange = Weibull, purple = gamma,
# pink = negative binomial.
tron_pair_bar <- ggplot(tron_clean_data) +
  geom_histogram(aes(x = n), stat = "density", fill = "steelblue") +
  stat_function(
    fun = "dlnorm",
    n = 100,
    args = list(
      meanlog = fit.lnorm.tron_pair_df$estimate[1],
      sdlog = fit.lnorm.tron_pair_df$estimate[2]
    ),
    color = "red",
    size = 1
  ) +
  stat_function(
    fun = "dexp",
    args = list(rate = fit.exp.tron_pair_df$estimate[1]),
    color = "green",
    size = 1
  ) +
  stat_function(
    fun = "dweibull",
    args = list(
      shape = fit.weibull.tron_pair_df$estimate[1],
      scale = fit.weibull.tron_pair_df$estimate[2]
    ),
    color = "orange",
    size = 1
  ) +
  stat_function(
    fun = "dgamma",
    args = list(
      shape = fit.gamma.tron_pair_df$estimate[1],
      rate = fit.gamma.tron_pair_df$estimate[2]
    ),
    color = "purple",
    size = 1
  ) +
  # The ones below here are somewhat useless for the full dataset
  # stat_function(fun = "dgeom",
  #               size = 2,
  #               args = list(prob = fit.geom.tron_pair_df$estimate[1]),
  #               color = "blue") +
  stat_function(
    fun = "dnbinom",
    args = list(
      size = fit.nbinomial.tron_pair_df$estimate[1],
      mu = fit.nbinomial.tron_pair_df$estimate[2]
    ),
    color = "pink",
    size = 1
  ) +
  theme_minimal()
# Density of the OmiseGO pair counts with the fitted curves overlaid:
# red = log-normal, green = exponential, purple = gamma,
# pink = negative binomial.
omg_pair_bar <- ggplot(omg_clean_data) +
  geom_histogram(aes(x = n), stat = "density", fill = "steelblue") +
  stat_function(
    fun = "dlnorm",
    n = 100,
    args = list(
      meanlog = fit.lnorm.omg_pair_df$estimate[1],
      sdlog = fit.lnorm.omg_pair_df$estimate[2]
    ),
    color = "red",
    size = 1
  ) +
  stat_function(
    fun = "dexp",
    args = list(rate = fit.exp.omg_pair_df$estimate[1]),
    color = "green",
    size = 1
  ) +
  # stat_function(fun = "dweibull",
  #               size = 1,
  #               args = list(shape = fit.weibull.omg_pair_df$estimate[1],
  #                           scale=fit.weibull.omg_pair_df$estimate[2]),
  #               color = "orange") +
  stat_function(
    fun = "dgamma",
    args = list(
      shape = fit.gamma.omg_pair_df$estimate[1],
      rate = fit.gamma.omg_pair_df$estimate[2]
    ),
    color = "purple",
    size = 1
  ) +
  # The ones below here are somewhat useless for the full dataset
  # stat_function(fun = "dgeom",
  #               size = 2,
  #               args = list(prob = fit.geom.omg_pair_df$estimate[1]),
  #               color = "blue") +
  stat_function(
    fun = "dnbinom",
    args = list(
      size = fit.nbinomial.omg_pair_df$estimate[1],
      mu = fit.nbinomial.omg_pair_df$estimate[2]
    ),
    color = "pink",
    size = 1
  ) +
  theme_minimal()
# Can do this later...
# list(mean = fit$estimate[1], sd = fit$estimate[2]))
# Render the three fitted-density plots in order: YoCoin, Tron, OmiseGO.
invisible(lapply(
  list(yocoin_pair_bar, tron_pair_bar, omg_pair_bar),
  print
))
## Conclusions
The log-normal distribution seems to fit best for all coins…
Determine some extra features on which we can build our multiple linear regressions.
Calculate the number of buys and sells by user ID. There is a great description of the `n = n()` idiom used here at: https://stackoverflow.com/questions/25869378/what-does-n-n-mean-in-r
# Buys: number of transfers received per address (one row per toID, with
# the count stored in column `n`).
omg_buys <- omg_edge_df %>%
  group_by(toID) %>%
  summarise(n = n()) %>%
  ungroup()
trn_buys <- trn_edge_df %>%
  group_by(toID) %>%
  summarise(n = n()) %>%
  ungroup()
yoc_buys <- yoc_edge_df %>%
  group_by(toID) %>%
  summarise(n = n()) %>%
  ungroup()

# Sells: number of transfers sent per address (one row per fromID, with
# the count stored in column `n`).
omg_sells <- omg_edge_df %>%
  group_by(fromID) %>%
  summarise(n = n()) %>%
  ungroup()
trn_sells <- trn_edge_df %>%
  group_by(fromID) %>%
  summarise(n = n()) %>%
  ungroup()
yoc_sells <- yoc_edge_df %>%
  group_by(fromID) %>%
  summarise(n = n()) %>%
  ungroup()
Filter to only include the top K buyers and build a data frame with the summarized data for fitting a regression model. Features we create here include:
- Avg_Tok_Amt: average token amount traded by the top-K users on the given day
- Tot_Tok_Amt: total token amount traded by the top-K users on the given day
- Transactions: number of transactions by the top-K users on the given day
- Distinct_Buyers: distinct number of buyers on the given day
- Distinct_Sellers: distinct number of sellers on the given day
# Number of most-active buyers kept for each token.
K_omg <- 104
K_trn <- 18000
K_yoc <- 136

# Rank buyers by their number of buys (largest first) and keep the first K.
omg_buys <- omg_buys %>%
  arrange(desc(n)) %>%
  head(K_omg)
trn_buys <- trn_buys %>%
  arrange(desc(n)) %>%
  head(K_trn)
yoc_buys <- yoc_buys %>%
  arrange(desc(n)) %>%
  head(K_yoc)

# Keep only the transfers whose recipient is one of those top-K buyers.
omg_top_k_buys <- omg_edge_df %>%
  filter(toID %in% omg_buys$toID)
trn_top_k_buys <- trn_edge_df %>%
  filter(toID %in% trn_buys$toID)
yoc_top_k_buys <- yoc_edge_df %>%
  filter(toID %in% yoc_buys$toID)
# Collapse a table of transfers into one row per Date holding the
# engineered regressors used for fitting a regression model.
summarise_daily_activity <- function(top_k_edges) {
  top_k_edges %>%
    group_by(Date) %>%
    summarise(
      Avg_Tok_Amt = mean(tokenAmount),       # mean amount per transfer
      Tot_Tok_Amt = sum(tokenAmount),        # total amount transferred
      Transactions = n(),                    # number of transfers
      Distinct_Buyers = n_distinct(toID),    # unique receiving addresses
      Distinct_Sellers = n_distinct(fromID)  # unique sending addresses
    ) %>%
    ungroup()
}

# One summary table per token, built from its top-K buyers' transfers.
omg_fit_data <- summarise_daily_activity(omg_top_k_buys)
trn_fit_data <- summarise_daily_activity(trn_top_k_buys)
yoc_fit_data <- summarise_daily_activity(yoc_top_k_buys)
Join edge data to pricing data based on the Date. We lose a small percentage of the data here due to the fact that the timeframes for the two data files do not match perfectly.
# Inner-join each token's daily summary to its daily prices on Date; dates
# that appear in only one of the two tables are dropped.
omg_fit_data <- merge(x = omg_fit_data, y = omg_price_df, by = "Date")
trn_fit_data <- merge(x = trn_fit_data, y = trn_price_df, by = "Date")
yoc_fit_data <- merge(x = yoc_fit_data, y = yoc_price_df, by = "Date")
Calculate the close values of the previous 3 days. Note: m1 refers to minus 1, i.e. one day previous
# Add Close_m1 / Close_m2 / Close_m3: the Close value from 1, 2 and 3 rows
# earlier (shift() is presumably data.table::shift with its default lag
# behaviour -- confirm). The first rows have no predecessor and become NA.
# NOTE(review): the lag is by row position, not by calendar day; if a date
# is missing after the merge, "previous day" may reach further back.
omg_fit_data <- omg_fit_data %>%
  mutate(
    Close_m1 = shift(Close, n = 1),
    Close_m2 = shift(Close, n = 2),
    Close_m3 = shift(Close, n = 3)
  )
trn_fit_data <- trn_fit_data %>%
  mutate(
    Close_m1 = shift(Close, n = 1),
    Close_m2 = shift(Close, n = 2),
    Close_m3 = shift(Close, n = 3)
  )
yoc_fit_data <- yoc_fit_data %>%
  mutate(
    Close_m1 = shift(Close, n = 1),
    Close_m2 = shift(Close, n = 2),
    Close_m3 = shift(Close, n = 3)
  )
Let’s take a look at our data with our newly engineered features on which we will fit our multiple regression model.
# Auto-print each token's regression table (daily engineered features,
# merged price columns, and the three lagged closes) for inspection.
omg_fit_data
trn_fit_data
yoc_fit_data
Let’s also take a look at how many days are tracked for the three tokens in our data sets. We have the most data for YOC.
# Report how many rows (one per Date) each token has available for fitting.
# Lines starting with "##" are the output recorded when this was knitted.
cat("OMG Rows: ", nrow(omg_fit_data), "\n")
## OMG Rows: 297
cat("TRX Rows: ", nrow(trn_fit_data), "\n")
## TRX Rows: 236
cat("YOC Rows: ", nrow(yoc_fit_data), "\n")
## YOC Rows: 422
We chose to regress to the Close value of the token, so we will compare the correlation of each of the regressors (Xs) to the Close (Y).
We can observe from this data that the previous days’ Close prices are far more strongly correlated with the current day’s Close price than the token amounts, distinct buyers, and the other features we engineered. This is expected.
# OMG: correlation of each regressor with Close.
# The printed labels do not name the token, so this header identifies it.
# The lagged closes contain NA in their first rows, hence
# use = "complete.obs" for those three.
cat("Transactions: ", cor(omg_fit_data$Close, omg_fit_data$Transactions), "\n")
## Transactions: 0.3133622
cat("Total Token Amount: ", cor(omg_fit_data$Close, omg_fit_data$Tot_Tok_Amt), "\n")
## Total Token Amount: -0.2566245
cat("Average Token Amount:", cor(omg_fit_data$Close, omg_fit_data$Avg_Tok_Amt), "\n")
## Average Token Amount: -0.4220751
cat("Distinct Buyers: ", cor(omg_fit_data$Close, omg_fit_data$Distinct_Buyers), "\n")
## Distinct Buyers: 0.6134334
cat("Distinct Sellers: ", cor(omg_fit_data$Close, omg_fit_data$Distinct_Sellers), "\n")
## Distinct Sellers: 0.2075846
cat("Close Minus 1: ", cor(omg_fit_data$Close, omg_fit_data$Close_m1, use = "complete.obs"), "\n")
## Close Minus 1: 0.9786079
cat("Close Minus 2: ", cor(omg_fit_data$Close, omg_fit_data$Close_m2, use = "complete.obs"), "\n")
## Close Minus 2: 0.9591295
cat("Close Minus 3: ", cor(omg_fit_data$Close, omg_fit_data$Close_m3, use = "complete.obs"), "\n")
## Close Minus 3: 0.9387131
# TRN (Tronix): correlation of each regressor with Close.
# The printed labels do not name the token, so this header identifies it.
# The lagged closes contain NA in their first rows, hence
# use = "complete.obs" for those three.
cat("Transactions: ", cor(trn_fit_data$Close, trn_fit_data$Transactions), "\n")
## Transactions: 0.5137099
cat("Total Token Amount: ", cor(trn_fit_data$Close, trn_fit_data$Tot_Tok_Amt), "\n")
## Total Token Amount: 0.194449
cat("Average Token Amount:", cor(trn_fit_data$Close, trn_fit_data$Avg_Tok_Amt), "\n")
## Average Token Amount: -0.1125508
cat("Distinct Buyers: ", cor(trn_fit_data$Close, trn_fit_data$Distinct_Buyers), "\n")
## Distinct Buyers: 0.8720125
cat("Distinct Sellers: ", cor(trn_fit_data$Close, trn_fit_data$Distinct_Sellers), "\n")
## Distinct Sellers: 0.243085
cat("Close Minus 1: ", cor(trn_fit_data$Close, trn_fit_data$Close_m1, use = "complete.obs"), "\n")
## Close Minus 1: 0.9615547
cat("Close Minus 2: ", cor(trn_fit_data$Close, trn_fit_data$Close_m2, use = "complete.obs"), "\n")
## Close Minus 2: 0.9165609
cat("Close Minus 3: ", cor(trn_fit_data$Close, trn_fit_data$Close_m3, use = "complete.obs"), "\n")
## Close Minus 3: 0.86749
# YOC (YoCoin): correlation of each regressor with Close.
# The printed labels do not name the token, so this header identifies it.
# The lagged closes contain NA in their first rows, hence
# use = "complete.obs" for those three.
cat("Transactions: ", cor(yoc_fit_data$Close, yoc_fit_data$Transactions), "\n")
## Transactions: -0.03078097
cat("Total Token Amount: ", cor(yoc_fit_data$Close, yoc_fit_data$Tot_Tok_Amt), "\n")
## Total Token Amount: -0.2928949
cat("Average Token Amount:", cor(yoc_fit_data$Close, yoc_fit_data$Avg_Tok_Amt), "\n")
## Average Token Amount: -0.2878166
cat("Distinct Buyers: ", cor(yoc_fit_data$Close, yoc_fit_data$Distinct_Buyers), "\n")
## Distinct Buyers: 0.3815441
cat("Distinct Sellers: ", cor(yoc_fit_data$Close, yoc_fit_data$Distinct_Sellers), "\n")
## Distinct Sellers: 0.05851667
cat("Close Minus 1: ", cor(yoc_fit_data$Close, yoc_fit_data$Close_m1, use = "complete.obs"), "\n")
## Close Minus 1: 0.9806905
cat("Close Minus 2: ", cor(yoc_fit_data$Close, yoc_fit_data$Close_m2, use = "complete.obs"), "\n")
## Close Minus 2: 0.9751215
cat("Close Minus 3: ", cor(yoc_fit_data$Close, yoc_fit_data$Close_m3, use = "complete.obs"), "\n")
## Close Minus 3: 0.9702466
Time to actually perform the fit via multiple linear regression. We will fit two different models for each coin: the first considers the previous 3 days of Close prices, and the second focuses only on the features we engineered. Because the previous three days are so highly correlated with the price, they make the R^2 value significantly higher, and we lose some understanding of which of the engineered features actually contributes the most.
Note: The models ending in "_hist" take the price history for the three previous days into account. Those ending in "_no_hist" do not.
# OMG, with price history: engineered daily features plus the three
# lagged closes.
omg_fit_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers +
    Close_m1 + Close_m2 + Close_m3,
  data = omg_fit_data
)

# OMG, without price history: engineered daily features only.
omg_fit_no_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers,
  data = omg_fit_data
)
# TRN, with price history: engineered daily features plus the three
# lagged closes.
trn_fit_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers +
    Close_m1 + Close_m2 + Close_m3,
  data = trn_fit_data
)

# TRN, without price history: engineered daily features only.
trn_fit_no_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers,
  data = trn_fit_data
)
# YOC, with price history: engineered daily features plus the three
# lagged closes.
yoc_fit_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers +
    Close_m1 + Close_m2 + Close_m3,
  data = yoc_fit_data
)

# YOC, without price history: engineered daily features only.
yoc_fit_no_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers,
  data = yoc_fit_data
)
# OMG model with price history: coefficient table and fit statistics.
# In the recorded output below, Close_m1 is by far the strongest term and
# R-squared is 0.96; three rows are dropped because the lagged closes are
# NA there.
print(summary(omg_fit_hist))
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers + Close_m1 + Close_m2 +
## Close_m3, data = omg_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8643 -0.4799 -0.0258 0.5055 4.9258
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.347e-02 2.614e-01 -0.281 0.7788
## Avg_Tok_Amt -6.106e-23 5.994e-23 -1.019 0.3092
## Tot_Tok_Amt 1.926e-25 8.781e-26 2.193 0.0291 *
## Transactions 1.266e-04 8.702e-05 1.455 0.1467
## Distinct_Buyers 1.225e-02 7.139e-03 1.716 0.0872 .
## Distinct_Sellers 7.825e-05 1.405e-04 0.557 0.5780
## Close_m1 9.173e-01 5.791e-02 15.842 <2e-16 ***
## Close_m2 3.376e-02 7.878e-02 0.429 0.6686
## Close_m3 -2.060e-03 5.695e-02 -0.036 0.9712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.056 on 285 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.96, Adjusted R-squared: 0.9589
## F-statistic: 855 on 8 and 285 DF, p-value: < 2.2e-16
# Draw the standard lm diagnostic plots for this fit.
plot(omg_fit_hist)
# OMG model without price history: coefficient table and fit statistics.
# In the recorded output below, Distinct_Buyers and Transactions are the
# most significant terms and R-squared is 0.4418.
print(summary(omg_fit_no_hist))
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers, data = omg_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.2808 -3.2030 -0.2738 2.4096 11.4253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.341e+00 8.543e-01 3.911 0.000115 ***
## Avg_Tok_Amt -2.385e-22 1.331e-22 -1.791 0.074285 .
## Tot_Tok_Amt -1.296e-25 2.897e-25 -0.447 0.654941
## Transactions 1.448e-03 3.159e-04 4.585 6.75e-06 ***
## Distinct_Buyers 2.335e-01 2.283e-02 10.227 < 2e-16 ***
## Distinct_Sellers -1.334e-03 5.184e-04 -2.573 0.010573 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.99 on 291 degrees of freedom
## Multiple R-squared: 0.4418, Adjusted R-squared: 0.4322
## F-statistic: 46.06 on 5 and 291 DF, p-value: < 2.2e-16
# Draw the standard lm diagnostic plots for this fit.
plot(omg_fit_no_hist)
# TRN model with price history: coefficient table and fit statistics.
# In the recorded output below, Close_m1, Distinct_Buyers and Close_m3 are
# the significant terms and R-squared is 0.9525; three rows are dropped
# because the lagged closes are NA there.
print(summary(trn_fit_hist))
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers + Close_m1 + Close_m2 +
## Close_m3, data = trn_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.030480 -0.002582 0.000472 0.001864 0.066202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.382e-03 8.301e-04 -1.665 0.09741 .
## Avg_Tok_Amt 1.331e-18 3.627e-17 0.037 0.97075
## Tot_Tok_Amt 1.182e-19 1.597e-19 0.740 0.45994
## Transactions -3.004e-06 1.580e-06 -1.901 0.05861 .
## Distinct_Buyers 3.328e-05 5.320e-06 6.256 1.98e-09 ***
## Distinct_Sellers 2.639e-06 1.730e-06 1.526 0.12846
## Close_m1 8.077e-01 5.785e-02 13.962 < 2e-16 ***
## Close_m2 4.101e-02 7.806e-02 0.525 0.59987
## Close_m3 -1.467e-01 5.462e-02 -2.685 0.00779 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008241 on 224 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.9525, Adjusted R-squared: 0.9508
## F-statistic: 561.2 on 8 and 224 DF, p-value: < 2.2e-16
# Draw the standard lm diagnostic plots for this fit.
plot(trn_fit_hist)
# TRN model without price history: coefficient table and fit statistics.
# In the recorded output below, Transactions, Distinct_Buyers and
# Distinct_Sellers are all highly significant and R-squared is 0.8115.
print(summary(trn_fit_no_hist))
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers, data = trn_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.080304 -0.007163 -0.001766 0.003777 0.068697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.037e-03 1.617e-03 -0.641 0.522
## Avg_Tok_Amt 1.178e-17 7.153e-17 0.165 0.869
## Tot_Tok_Amt -4.220e-19 3.077e-19 -1.372 0.172
## Transactions -1.989e-05 2.809e-06 -7.081 1.72e-11 ***
## Distinct_Buyers 1.264e-04 7.336e-06 17.229 < 2e-16 ***
## Distinct_Sellers 2.007e-05 3.113e-06 6.449 6.57e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01628 on 230 degrees of freedom
## Multiple R-squared: 0.8115, Adjusted R-squared: 0.8074
## F-statistic: 198.1 on 5 and 230 DF, p-value: < 2.2e-16
# Draw the standard lm diagnostic plots for this fit.
plot(trn_fit_no_hist)
# YOC model with price history: coefficient table and fit statistics.
# In the recorded output below, only the three lagged closes are
# significant and R-squared is 0.9661; three rows are dropped because the
# lagged closes are NA there.
print(summary(yoc_fit_hist))
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers + Close_m1 + Close_m2 +
## Close_m3, data = yoc_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.052084 -0.001512 -0.000703 0.001339 0.044489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.171e-03 7.241e-04 1.617 0.106554
## Avg_Tok_Amt -2.921e-25 5.028e-25 -0.581 0.561621
## Tot_Tok_Amt -4.565e-30 4.740e-27 -0.001 0.999232
## Transactions -9.850e-07 9.762e-07 -1.009 0.313549
## Distinct_Buyers -1.245e-05 2.951e-05 -0.422 0.673353
## Distinct_Sellers 2.688e-06 5.304e-06 0.507 0.612531
## Close_m1 5.855e-01 5.028e-02 11.644 < 2e-16 ***
## Close_m2 2.244e-01 5.732e-02 3.915 0.000106 ***
## Close_m3 1.595e-01 5.007e-02 3.185 0.001558 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007628 on 410 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.9661, Adjusted R-squared: 0.9654
## F-statistic: 1461 on 8 and 410 DF, p-value: < 2.2e-16
# Draw the standard lm diagnostic plots for this fit.
plot(yoc_fit_hist)
# YOC model without price history: coefficient table and fit statistics.
# In the recorded output below, Distinct_Buyers and Transactions are the
# most significant terms and R-squared is 0.2544.
print(summary(yoc_fit_no_hist))
##
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
## Distinct_Buyers + Distinct_Sellers, data = yoc_fit_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.064999 -0.026200 -0.005130 0.007418 0.138106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.553e-02 3.040e-03 11.688 < 2e-16 ***
## Avg_Tok_Amt -5.981e-24 2.411e-24 -2.481 0.0135 *
## Tot_Tok_Amt -4.511e-26 2.278e-26 -1.981 0.0483 *
## Transactions -2.628e-05 4.527e-06 -5.806 1.27e-08 ***
## Distinct_Buyers 1.118e-03 1.278e-04 8.749 < 2e-16 ***
## Distinct_Sellers 4.069e-05 2.521e-05 1.614 0.1072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03683 on 416 degrees of freedom
## Multiple R-squared: 0.2544, Adjusted R-squared: 0.2454
## F-statistic: 28.39 on 5 and 416 DF, p-value: < 2.2e-16
# Draw the standard lm diagnostic plots for this fit.
plot(yoc_fit_no_hist)
We find that including the last three days of close prices really overpowers any gains we make via our engineered regressors. All three models give us R^2 values over .95, which is great! Unfortunately, this approach will not be able to predict quick spikes or drops in the price, as it is simply going to estimate a linear trajectory based on the previous days’ action.
If we disregard the previous days’ close prices, we are able to get the following R^2 values after [manually] experimenting with K values representing the top K buyers:
- OMG: 0.4418 (K=104)
- TRN: ~0.8115 (K=~18,000)
- YOC: 0.2551 (K=135)
Note that TRN’s K value which produced the highest R^2 Value was exceptionally high compared to OMG and YOC. We plan to explore why this was the case in our writeup.